Lecture based on chapter 4 by Hal Daumé III, on Kilian Weinberger's lecture 3, and on Tom Mitchell's lecture 1.
"How can we build computer programs that automatically improve their performance through experience?"
How can we learn from data?
How robust is what we learn? What types of assumptions do we make with different approaches? What are the guarantees? How do we pick an approach?
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
plt.rcParams['text.usetex'] = True
import warnings
warnings.filterwarnings('ignore')
def plot_line(ax, xlims, w, do_norm=True):
    """Draw the line w[0]*x1 + w[1]*x2 + w[2] = 0 and its normal vector.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    xlims : (lo, hi) range used for both the x1 and x2 axis limits.
    w : length-3 weight vector [w1, w2, b]; w[1] must be nonzero, since
        solving for x2 divides by it (a vertical line is not drawable here).
    do_norm : if True, scale the normal arrow (w1, w2) by ||w|| so it is
        drawn at unit length; if False, draw it at its raw length.
    """
    x1 = np.linspace(xlims[0], xlims[1], 1000)
    # Solve w[0]*x1 + w[1]*x2 + w[2] = 0 for x2.
    x2_plot = (-w[2] - w[0] * x1) / w[1]
    ax.plot(x1, x2_plot)
    # Anchor the normal-vector arrow at the middle sample of the line
    # (one consistent integer index, instead of the original mix of
    # np.array(...).astype(int) and int(...) for the same midpoint).
    mid = x1.shape[0] // 2
    origin = x1[mid], x2_plot[mid]
    # NOTE: the norm includes the bias w[2], matching the original code.
    nn = np.linalg.norm(w) if do_norm else 1
    ax.quiver(*origin, w[0] / nn, w[1] / nn, color='k',
              angles='xy', scale_units='xy', scale=1)
    ax.axis('equal')
    ax.axis([xlims[0], xlims[1], xlims[0], xlims[1]])
    ax.set_xlabel(r'$x_1$', fontsize=20)
    ax.set_ylabel(r'$x_2$', fontsize=20)
    ax.grid()
# Example weight vector and bias for a 2-D linear decision boundary.
w = [4,0.6]
b = 0
f, ax = plt.subplots(figsize=(5,5))
# Draw the line w[0]*x1 + w[1]*x2 + b = 0 over x1 in [-5, 5].
plot_line(ax, [-5,5], [w[0],w[1],b])
What is a separable dataset?
from sklearn import datasets
# Draw 10 points from two Gaussian blobs centered at (-1, 1) and (1, -1).
X, y = datasets.make_blobs(n_samples=10, centers=np.array([[-1,1],[1,-1]]),
n_features=2, center_box=(0, 10))
# make_blobs labels classes 0/1; relabel 0 as -1 for the perceptron.
y[y==0] = -1
def plot_dataset(ax, X, y):
    """Scatter a binary-labelled 2-D dataset on `ax`.

    Points with label -1 are drawn as green triangles ('g^'),
    points with label +1 as blue squares ('bs').
    """
    neg = y == -1
    pos = y == 1
    ax.plot(X[neg, 0], X[neg, 1], 'g^')
    ax.plot(X[pos, 0], X[pos, 1], 'bs')
f, ax = plt.subplots(figsize=(5,5))
# Visualize the two-class blob dataset generated above.
plot_dataset(ax,X,y)
We can write ${\bf x}_i$ as:
\begin{align} {\bf x_i}' &= \begin{bmatrix}
{\bf x_i} \\
1
\end{bmatrix}
\end{align}
and incorporate $b$ into ${\bf w}$:
def perceptron_train(X, y, MaxIter=20):
    """Train a perceptron and return the learned weight vector.

    X is an (n_samples, n_features) array — append a constant-1 column
    beforehand to fold the bias into the weights — and y holds labels
    in {-1, +1}.  Runs for at most MaxIter passes over the data,
    stopping early once a full pass makes no mistakes.
    """
    weights = np.zeros(X.shape[1])
    for _ in range(MaxIter):
        mistakes = 0
        for xi, yi in zip(X, y):
            # Non-positive margin: xi is misclassified (or sits exactly
            # on the boundary), so nudge the weights toward its side.
            if yi * (weights @ xi) <= 0:
                weights = weights + yi * xi
                mistakes += 1
        if mistakes == 0:
            # A perfect pass — the data is separated; no more updates.
            break
    return weights
f,ax = plt.subplots(figsize=(4,4))
plot_dataset(ax,X,y)
# Append a constant-1 feature so the bias b is folded into the weights.
Xprime = np.hstack([X,np.ones((X.shape[0],1))])
w = perceptron_train(Xprime,y)
# Draw the learned decision boundary over the data.
plot_line(ax,[-4,4], w)
def perceptron_train_and_plot(axs, X, y, MaxIter=20):
    """Run perceptron training, plotting the state after every update.

    Parameters
    ----------
    axs : flat sequence of matplotlib Axes.  axs[0] shows the data with
        the initial (zero) weight vector; each subsequent axis shows the
        point that triggered an update (enlarged marker) together with
        the decision boundary after that update.
    X, y : training data; y holds labels in {-1, +1}.
    MaxIter : maximum number of passes over the data.

    Returns the final weight vector.  When there are more updates than
    axes, prints 'not enough subplots' for each update it cannot draw.
    """
    w = np.zeros((X.shape[1]))
    plot_dataset(axs[0], X, y)
    plot_line(axs[0], [-4, 4], w)
    plt_cnt = 0  # updates made so far == index of the next axis to draw on
    for i in range(MaxIter):
        m = 0  # mistakes during this pass
        for (xi, yi) in zip(X, y):
            if yi * w.T.dot(xi) <= 0:  # misclassified (or zero margin)
                w = w + yi * xi
                m = m + 1
                plt_cnt = plt_cnt + 1
                # Catch only running out of axes; the original bare
                # `except:` also swallowed real bugs (NameError, etc.).
                try:
                    # Enlarge the point that caused this update, using the
                    # same marker style as plot_dataset.
                    if yi == -1:
                        axs[plt_cnt - 1].plot([xi[0]], [xi[1]], 'g^', markersize=10)
                    else:
                        axs[plt_cnt - 1].plot([xi[0]], [xi[1]], 'bs', markersize=10)
                    plot_dataset(axs[plt_cnt], X, y)
                    plot_line(axs[plt_cnt], [-4, 4], w, do_norm=False)
                except IndexError:
                    print('not enough subplots')
        if m == 0:
            break
    return w
# A 2x4 grid gives 8 axes: one initial plot plus up to 7 updates.
f,axs = plt.subplots(nrows=2,ncols=4,figsize=(20,10))
w = perceptron_train_and_plot(axs.reshape(-1),Xprime,y)
(Cell output) `not enough subplots` printed five times — the perceptron made more updates than the 2×4 grid of subplots could display.
# Draw a fresh 10-point dataset from the same two blobs and rerun the
# visualized training (results vary from run to run — make_blobs is random).
X, y = datasets.make_blobs(n_samples=10, centers=np.array([[-1,1],[1,-1]]),
n_features=2, center_box=(0, 10))
# Relabel class 0 as -1 for the perceptron.
y[y==0] = -1
# Append a constant-1 feature so the bias is folded into the weights.
Xprime = np.hstack([X,np.ones((X.shape[0],1))])
f,axs = plt.subplots(nrows=2,ncols=4,figsize=(20,10))
w = perceptron_train_and_plot(axs.reshape(-1),Xprime,y)
f, ax = plt.subplots(figsize=(5,5))
# Line 3*x1 + 4*x2 = 0 with its normal vector.
plot_line(ax, [-5,5], [3,4,0])
# Blue square (positive-class marker) at (-3, 1); note 3*(-3) + 4*1 = -5 < 0,
# so this point lies on the negative side of the line.
ax.plot([-3],[1],'bs')
[<matplotlib.lines.Line2D at 0x11c503c88>]
The perceptron algorithm converges after at most $\frac{1}{\gamma^2}$ updates if the data is linearly separable.
$\gamma$ is the margin of the problem instance (defined on next slide).
Given:
If all of the above holds, then the Perceptron algorithm makes at most $\frac{1}{\gamma^2}$ mistakes.